clear
set mem 6m
use "simulation_data_base", clear
set more off

*gen index number so we can resort below
gen obs_number = _n

gen sim_truenumones = .
gen sim_modelchi =.

gen sim_b_home=.
gen sim_se_home=.
gen sim_b_business=.
gen sim_se_business=.
gen sim_b_person=.
gen sim_se_person=.
gen sim_b_car=.
gen sim_se_car=.
gen sim_b_extent=.
gen sim_se_extent=.
gen sim_b_warrant=.
gen sim_se_warrant=.
gen sim_b_incident=.
gen sim_se_incident=.
gen sim_b_afterlaw=.
gen sim_se_afterlaw=.
gen sim_b_unlawful=.
gen sim_se_unlawful=.
gen sim_b_probcaus=.
gen sim_se_probcaus=.
gen sim_b_except=.
gen sim_se_except=.
gen sim_b_change=.
gen sim_se_change=.

gen sim_chosencases=.
gen sim_randomcases=.

gen sim_truenumones_in = .
gen sim_n_in = .
gen sim_ones_in=.
gen sim_avgabschangeprob_in = .
gen sim_avgchangeprob_in = .
gen sim_caseflips_in=.
gen sim_falsepos_in = .
gen sim_falseneg_in = .

gen sim_n_out = .
gen sim_ones_out=.
gen sim_avgabschangeprob_out = .
gen sim_avgchangeprob_out = .
gen sim_caseflips_out=.
gen sim_falsepos_out = .
gen sim_falseneg_out = .

gen sim_casecat_A=.
gen sim_casecat_B=.
gen sim_casecat_C=.
gen sim_casecat_D=.

gen sim_ones_all=.
gen sim_avgabschangeprob_all = .
gen sim_avgchangeprob_all = .
gen sim_caseflips_all=.
gen sim_falsepos_all = .
gen sim_falseneg_all = .

gen sim_home_avgimpact=.
gen sim_except_avgimpact=.

*GENERATE LATENT (PREDICTED) PROBABILITIES
*set # of replications
forvalues i=1(1)1000{
gen rawscore_actual =  1.97 - 2.77*home - 2.34*business - 1.94*person  - 1.94*car - 1.53*extent + 1.73*warrant + 2.89*incident + 1.03*afterlaw + 0*unlawful - 0*probcaus  + 1.37*except + .29*change
gen latent_actual = invlogit(rawscore_actual)
gen actual_5 = 1 if latent_actual >=.5
replace actual_5 = 0 if latent_actual <.5

*create variable to measure length of dataset
egen length = count(home)

*create new "truth" based on comparing latent probability to (different) uniform random variable
gen random1 = uniform()
gen actual_random = 1 if latent_actual >= random1
replace actual_random = 0 if latent_actual < random1

*count # of "true" 1s
egen tempvar = mean(actual_random)
replace sim_truenumones=tempvar in `i'
drop tempvar

*SELECTING CASES
*anomalous cases.  latent_actual high and outcome is liberal, or latent_actual is low and outcome is conservative
	*note: we want 180 cases to be random, 120 to be selected according to selection criteria

gen random2 = uniform()
sort random2
gen random_case = 1 in 1/180
replace random_case = 0 if random_case ==.
*create indicator for all cases that meet selection criteria
gen chosen_case_population = 1 if   (latent_actual >=.87 & actual_random==0) | (latent_actual <=.36 & actual_random==1) 
tab chosen_case_population
gen random3 = uniform() if chosen_case_population == 1
sort random3 
gen chosen_case = 1 in 1/120
replace chosen_case = 0 if chosen_case ==.
gen selected_case=1 if random_case == 1 | chosen_case ==1
replace selected_case = 0 if selected_case==.

*re-sort by observation number
sort obs_number 

*count # of "true" 1s IN SAMPLE 
gen actual_random_in = actual_random if selected_case == 1
egen tempvar = mean(actual_random_in) 
replace sim_truenumones_in=tempvar in `i'
drop tempvar actual_random_in

*RUN LOGIT
*sets up the selected case logit and predicted outcomes
logit actual_random home business person car extent warrant incident afterlaw unlawful probcaus except change if selected_case ==1, nolog

*get number of observations used in each logit
gen tempvar = e(N)
replace sim_n_in = tempvar in `i'
drop tempvar
*create indicator for cases used in logit (may be different from selected case if observations are dropped)
gen logit_case = 1 if e(sample)
replace logit_case = 0 if logit_case == .

*get counts
*nubmer of random cases
gen logit_case_random = 1 if logit_case == 1 & random_case == 1 
replace logit_case_random = 0 if logit_case_random == .
egen tempvar = sum(logit_case_random)
replace sim_randomcases = tempvar in `i'
drop tempvar

*number of chosen cases
gen logit_case_chosen = 1 if logit_case == 1 & chosen_case == 1 & random_case == 0
replace logit_case_chosen = 0 if logit_case_chosen == .
egen tempvar = sum(logit_case_chosen)
replace sim_chosencases = tempvar in `i'
drop tempvar


*get predicted probs for home and except
adjust home=0, by(home) pr generate(home_low) 
adjust home=1, by(home) pr generate(home_high)
gen home_impact = home_high - home_low
egen tempvar = median(home_impact)
replace sim_home_avgimpact = tempvar in `i'
drop tempvar
drop home_low home_high home_impact

adjust except=0, by(except) pr generate(except_low)
adjust except=1, by(except) pr generate(except_high)
gen except_impact = except_high - except_low
egen tempvar = median(except_impact)
replace sim_except_avgimpact = tempvar in `i'
drop tempvar
drop except_low except_high except_impact

*get model params
*modelchi
gen tempvar = e(chi2)
replace sim_modelchi = tempvar in `i'
drop tempvar

*b home
capture gen tempvar = _b[home]
capture replace sim_b_home = tempvar in `i'
capture drop tempvar
*se home
capture gen tempvar = _se[home]
capture replace sim_se_home = tempvar in `i'
capture  drop tempvar

*b business
capture gen tempvar = _b[business]
capture replace sim_b_business = tempvar in `i'
capture drop tempvar
*se business
capture gen tempvar = _se[business]
capture replace sim_se_business = tempvar in `i'
capture drop tempvar

*b person
capture gen tempvar = _b[person]
capture replace sim_b_person = tempvar in `i'
capture drop tempvar
*se person
capture gen tempvar = _se[person]
capture replace sim_se_person = tempvar in `i'
capture drop tempvar

*b car
capture gen tempvar = _b[car]
capture replace sim_b_car = tempvar in `i'
capture drop tempvar
*se car
capture gen tempvar = _se[car]
capture replace sim_se_car = tempvar in `i'
capture drop tempvar

*b extent
capture gen tempvar = _b[extent]
capture replace sim_b_extent = tempvar in `i'
capture drop tempvar
*se extent
capture gen tempvar = _se[extent]
capture replace sim_se_extent = tempvar in `i'
capture drop tempvar

*b warrant
capture gen tempvar = _b[warrant]
capture replace sim_b_warrant = tempvar in `i'
capture drop tempvar
*se warrant
capture gen tempvar = _se[warrant]
capture replace sim_se_warrant = tempvar in `i'
capture drop tempvar

*b incident
capture gen tempvar = _b[incident]
capture replace sim_b_incident = tempvar in `i'
capture drop tempvar
*se incident
capture gen tempvar = _se[incident]
capture replace sim_se_incident = tempvar in `i'
capture drop tempvar

*b afterlaw
capture gen tempvar = _b[afterlaw]
capture replace sim_b_afterlaw = tempvar in `i'
capture drop tempvar
*se afterlaw
capture gen tempvar = _se[afterlaw]
capture replace sim_se_afterlaw = tempvar in `i'
capture drop tempvar

*b unlawful
capture gen tempvar = _b[unlawful]
capture replace sim_b_unlawful = tempvar in `i'
capture drop tempvar
*se unlawful
capture gen tempvar = _se[unlawful]
capture replace sim_se_unlawful = tempvar in `i'
capture drop tempvar

*b probcaus
capture gen tempvar = _b[probcaus]
capture replace sim_b_probcaus = tempvar in `i'
capture drop tempvar
*se probcaus
capture gen tempvar = _se[probcaus]
capture replace sim_se_probcaus = tempvar in `i'
capture drop tempvar

*b except
capture gen tempvar = _b[except]
capture replace sim_b_except = tempvar in `i'
capture drop tempvar
*se except
capture gen tempvar = _se[except]
capture replace sim_se_except = tempvar in `i'
capture drop tempvar

*b change
capture gen tempvar = _b[change]
capture replace sim_b_change = tempvar in `i'
capture drop tempvar
*se change
capture gen tempvar = _se[change]
capture replace sim_se_change = tempvar in `i'
capture drop tempvar

*IN SAMPLE

*pred in sample; used "logit_cases" to make predictions only on observations that actually entered model
predict latent_selected if logit_case ==1
gen selected_5= 1 if latent_selected>= .5 & logit_case ==1
replace selected_5= 0 if latent_selected< .5 & logit_case ==1

*gets our comparisons
gen abschange_latent= abs(latent_selected- latent_actual) if logit_case ==1
gen change_latent= latent_selected- latent_actual if logit_case ==1
gen caseflip = 1 if selected_5 != actual_random & logit_case ==1
replace caseflip = 0 if selected_5 == actual_random & logit_case ==1
gen falsepos = 1 if actual_random==0 & selected_5 ==1 & logit_case ==1
replace falsepos=0 if falsepos==. & logit_case ==1
gen falseneg = 1 if actual_random==1 & selected_5 ==0 & logit_case ==1
replace falseneg=0 if falseneg==. & logit_case ==1

*above needs to be done for IN, OUT

*number of 1s
*summarize selected_5 if logit_case ==1
egen tempvar = sum(selected_5)
replace sim_ones_in =  tempvar in `i'
drop tempvar

*average abs change in prob
*summarize abschange_latent if logit_case ==1
egen tempvar = mean(abschange_latent)
replace sim_avgabschangeprob_in = tempvar in `i'
drop tempvar

*average change in prob
*summarize change_latent if logit_case ==1
egen tempvar = mean(change_latent)
replace sim_avgchangeprob_in = tempvar in `i'
drop tempvar

*number case flips
*summarize caseflip if logit_case ==1
*tab selected_5 actual_random if logit_case ==1
egen tempvar = sum(caseflip)
replace sim_caseflips_in = tempvar in `i'
drop tempvar

*false positive
egen tempvar = sum(falsepos)
replace sim_falsepos_in = tempvar in `i'
drop tempvar

*false neg
egen tempvar = sum(falseneg)
replace sim_falseneg_in = tempvar in `i'
drop tempvar

drop  latent_selected selected_5 abschange_latent change_latent caseflip falsepos falseneg

*Out cases

*pred out sample
predict latent_selected if logit_case ==0
gen selected_5= 1 if latent_selected>= .5 & logit_case ==0
replace selected_5= 0 if latent_selected< .5 & logit_case ==0


*gets our comparisons
gen abschange_latent= abs(latent_selected- latent_actual) if logit_case ==0
gen change_latent= latent_selected- latent_actual if logit_case ==0
gen caseflip = 1 if selected_5 != actual_random & logit_case ==0
replace caseflip = 0 if selected_5 == actual_random & logit_case ==0
gen falsepos = 1 if actual_random==0 & selected_5 ==1 & logit_case ==0
replace falsepos=0 if falsepos==. & logit_case ==0
gen falseneg = 1 if actual_random==1 & selected_5 ==0 & logit_case ==0
replace falseneg=0 if falseneg==. & logit_case ==0

*get N of out-sample
egen tempvar = sum(logit_case)
replace sim_n_out = length - tempvar in `i'
drop tempvar

*number of 1s
*summarize selected_5 if logit_case ==1
egen tempvar = sum(selected_5)
replace sim_ones_out =  tempvar in `i'
drop tempvar

*average abs change in prob
*summarize abschange_latent if logit_case ==1
egen tempvar = mean(abschange_latent)
replace sim_avgabschangeprob_out = tempvar in `i'
drop tempvar

*average change in prob
*summarize change_latent if logit_case ==1
egen tempvar = mean(change_latent)
replace sim_avgchangeprob_out = tempvar in `i'
drop tempvar

*number case flips
*summarize caseflip if logit_case ==1
*tab selected_5 actual_random if logit_case ==1
egen tempvar = sum(caseflip)
replace sim_caseflips_out = tempvar in `i'
drop tempvar

*false positive
egen tempvar = sum(falsepos)
replace sim_falsepos_out = tempvar in `i'
drop tempvar

*false neg
egen tempvar = sum(falseneg)
replace sim_falseneg_out = tempvar in `i'
drop tempvar

*getting out of sample quantities for numerical examples as in Table 2
*compare true to inferred -- getting number of cases in each category
*A 1 1    B 1 0     C 0 1   D 0 0
gen casecat_A = actual_random * selected_5
gen casecat_B = actual_random * (1-selected_5)
gen casecat_C = (1- actual_random) * selected_5
gen casecat_D = (1- actual_random) * (1- selected_5)
egen tempvar = sum(casecat_A)
replace sim_casecat_A = tempvar in `i'
drop tempvar
egen tempvar = sum(casecat_B)
replace sim_casecat_B = tempvar in `i'
drop tempvar
egen tempvar = sum(casecat_C)
replace sim_casecat_C = tempvar in `i'
drop tempvar
egen tempvar = sum(casecat_D)
replace sim_casecat_D = tempvar in `i'
drop tempvar

drop casecat_A casecat_B casecat_C casecat_D
drop    latent_selected selected_5 abschange_latent change_latent caseflip falsepos falseneg


*All cases

*pred all sample
predict latent_selected 
gen selected_5= 1 if latent_selected>= .5 
replace selected_5= 0 if latent_selected< .5 

*gets our comparisons
gen abschange_latent= abs(latent_selected- latent_actual) 
gen change_latent= latent_selected- latent_actual 
gen caseflip = 1 if selected_5 != actual_random
replace caseflip = 0 if selected_5 == actual_random 
gen falsepos = 1 if actual_random==0 & selected_5 ==1  
replace falsepos=0 if falsepos==.  
gen falseneg = 1 if actual_random==1 & selected_5 ==0  
replace falseneg=0 if falseneg==.  

*get N

*number of 1s
*summarize selected_5 
egen tempvar = sum(selected_5)
replace sim_ones_all =  tempvar in `i'
drop tempvar

*average abs change in prob
*summarize abschange_latent
egen tempvar = mean(abschange_latent)
replace sim_avgabschangeprob_all = tempvar in `i'
drop tempvar

*average change in prob
*summarize change_latent 
egen tempvar = mean(change_latent)
replace sim_avgchangeprob_all = tempvar in `i'
drop tempvar

*number case flips
*summarize caseflip if logit_case ==1
*tab selected_5 actual_random 
egen tempvar = sum(caseflip)
replace sim_caseflips_all = tempvar in `i'
drop tempvar

*false positive
egen tempvar = sum(falsepos)
replace sim_falsepos_all = tempvar in `i'
drop tempvar

*false neg
egen tempvar = sum(falseneg)
replace sim_falseneg_all = tempvar in `i'
drop tempvar

drop rawscore_actual latent_actual actual_5

drop length random1 random2 random3 actual_random chosen_case_population random_case logit_case logit_case_random logit_case_chosen selected_case latent_selected chosen_case selected_5 abschange_latent change_latent caseflip falsepos falseneg
}

keep sim*
keep if  sim_truenumones != .

*change sim outs to percentages
replace sim_casecat_A = sim_casecat_A / sim_n_out
replace sim_casecat_B = sim_casecat_B / sim_n_out
replace sim_casecat_C = sim_casecat_C / sim_n_out
replace sim_casecat_D = sim_casecat_D / sim_n_out
replace sim_caseflips_out = sim_caseflips_out / sim_n_out
replace sim_falsepos_out = sim_falsepos_out / sim_n_out
replace sim_falseneg_out = sim_falseneg_out / sim_n_out

summarize sim_truenumones sim_casecat_A  sim_casecat_B  sim_casecat_C  sim_casecat_D   sim_caseflips_out  sim_falsepos_out sim_falseneg_out sim_n_out

save "sim_results_anomalous_all.dta", replace
